1   package org.apache.lucene.index;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one or more
5    * contributor license agreements.  See the NOTICE file distributed with
6    * this work for additional information regarding copyright ownership.
7    * The ASF licenses this file to You under the Apache License, Version 2.0
8    * (the "License"); you may not use this file except in compliance with
9    * the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  import java.io.IOException;
20  import java.util.ArrayList;
21  import java.util.Arrays;
22  
23  import org.apache.lucene.analysis.MockAnalyzer;
24  import org.apache.lucene.document.Document;
25  import org.apache.lucene.document.Field;
26  import org.apache.lucene.document.FieldType;
27  import org.apache.lucene.document.TextField;
28  import org.apache.lucene.search.DocIdSetIterator;
29  import org.apache.lucene.store.Directory;
30  import org.apache.lucene.util.Bits;
31  import org.apache.lucene.util.BytesRef;
32  import org.apache.lucene.util.LuceneTestCase;
33  import org.apache.lucene.util.TestUtil;
34  
35  public class TestDocsAndPositions extends LuceneTestCase {
36    private String fieldName;
37  
38    @Override
39    public void setUp() throws Exception {
40      super.setUp();
41      fieldName = "field" + random().nextInt();
42    }
43  
44    /**
45     * Simple testcase for {@link PostingsEnum}
46     */
47    public void testPositionsSimple() throws IOException {
48      Directory directory = newDirectory();
49      RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
50          newIndexWriterConfig(new MockAnalyzer(random())));
51      for (int i = 0; i < 39; i++) {
52        Document doc = new Document();
53        FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
54        customType.setOmitNorms(true);
55        doc.add(newField(fieldName, "1 2 3 4 5 6 7 8 9 10 "
56            + "1 2 3 4 5 6 7 8 9 10 " + "1 2 3 4 5 6 7 8 9 10 "
57            + "1 2 3 4 5 6 7 8 9 10", customType));
58        writer.addDocument(doc);
59      }
60      IndexReader reader = writer.getReader();
61      writer.close();
62  
63      int num = atLeast(13);
64      for (int i = 0; i < num; i++) {
65        BytesRef bytes = new BytesRef("1");
66        IndexReaderContext topReaderContext = reader.getContext();
67        for (LeafReaderContext leafReaderContext : topReaderContext.leaves()) {
68          PostingsEnum docsAndPosEnum = getDocsAndPositions(
69              leafReaderContext.reader(), bytes);
70          assertNotNull(docsAndPosEnum);
71          if (leafReaderContext.reader().maxDoc() == 0) {
72            continue;
73          }
74          final int advance = docsAndPosEnum.advance(random().nextInt(leafReaderContext.reader().maxDoc()));
75          do {
76            String msg = "Advanced to: " + advance + " current doc: "
77                + docsAndPosEnum.docID(); // TODO: + " usePayloads: " + usePayload;
78            assertEquals(msg, 4, docsAndPosEnum.freq());
79            assertEquals(msg, 0, docsAndPosEnum.nextPosition());
80            assertEquals(msg, 4, docsAndPosEnum.freq());
81            assertEquals(msg, 10, docsAndPosEnum.nextPosition());
82            assertEquals(msg, 4, docsAndPosEnum.freq());
83            assertEquals(msg, 20, docsAndPosEnum.nextPosition());
84            assertEquals(msg, 4, docsAndPosEnum.freq());
85            assertEquals(msg, 30, docsAndPosEnum.nextPosition());
86          } while (docsAndPosEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
87        }
88      }
89      reader.close();
90      directory.close();
91    }
92  
93    public PostingsEnum getDocsAndPositions(LeafReader reader,
94        BytesRef bytes) throws IOException {
95      Terms terms = reader.terms(fieldName);
96      if (terms != null) {
97        TermsEnum te = terms.iterator();
98        if (te.seekExact(bytes)) {
99          return te.postings(null, PostingsEnum.ALL);
100       }
101     }
102     return null;
103   }
104 
105   /**
106    * this test indexes random numbers within a range into a field and checks
107    * their occurrences by searching for a number from that range selected at
108    * random. All positions for that number are saved up front and compared to
109    * the enums positions.
110    */
111   public void testRandomPositions() throws IOException {
112     Directory dir = newDirectory();
113     RandomIndexWriter writer = new RandomIndexWriter(random(), dir,
114         newIndexWriterConfig(new MockAnalyzer(random()))
115           .setMergePolicy(newLogMergePolicy()));
116     int numDocs = atLeast(47);
117     int max = 1051;
118     int term = random().nextInt(max);
119     Integer[][] positionsInDoc = new Integer[numDocs][];
120     FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
121     customType.setOmitNorms(true);
122     for (int i = 0; i < numDocs; i++) {
123       Document doc = new Document();
124       ArrayList<Integer> positions = new ArrayList<>();
125       StringBuilder builder = new StringBuilder();
126       int num = atLeast(131);
127       for (int j = 0; j < num; j++) {
128         int nextInt = random().nextInt(max);
129         builder.append(nextInt).append(" ");
130         if (nextInt == term) {
131           positions.add(Integer.valueOf(j));
132         }
133       }
134       if (positions.size() == 0) {
135         builder.append(term);
136         positions.add(num);
137       }
138       doc.add(newField(fieldName, builder.toString(), customType));
139       positionsInDoc[i] = positions.toArray(new Integer[0]);
140       writer.addDocument(doc);
141     }
142 
143     IndexReader reader = writer.getReader();
144     writer.close();
145 
146     int num = atLeast(13);
147     for (int i = 0; i < num; i++) {
148       BytesRef bytes = new BytesRef("" + term);
149       IndexReaderContext topReaderContext = reader.getContext();
150       for (LeafReaderContext leafReaderContext : topReaderContext.leaves()) {
151         PostingsEnum docsAndPosEnum = getDocsAndPositions(
152             leafReaderContext.reader(), bytes);
153         assertNotNull(docsAndPosEnum);
154         int initDoc = 0;
155         int maxDoc = leafReaderContext.reader().maxDoc();
156         // initially advance or do next doc
157         if (random().nextBoolean()) {
158           initDoc = docsAndPosEnum.nextDoc();
159         } else {
160           initDoc = docsAndPosEnum.advance(random().nextInt(maxDoc));
161         }
162         // now run through the scorer and check if all positions are there...
163         do {
164           int docID = docsAndPosEnum.docID();
165           if (docID == DocIdSetIterator.NO_MORE_DOCS) {
166             break;
167           }
168           Integer[] pos = positionsInDoc[leafReaderContext.docBase + docID];
169           assertEquals(pos.length, docsAndPosEnum.freq());
170           // number of positions read should be random - don't read all of them
171           // allways
172           final int howMany = random().nextInt(20) == 0 ? pos.length
173               - random().nextInt(pos.length) : pos.length;
174           for (int j = 0; j < howMany; j++) {
175             assertEquals("iteration: " + i + " initDoc: " + initDoc + " doc: "
176                 + docID + " base: " + leafReaderContext.docBase
177                 + " positions: " + Arrays.toString(pos) /* TODO: + " usePayloads: "
178                 + usePayload*/, pos[j].intValue(), docsAndPosEnum.nextPosition());
179           }
180 
181           if (random().nextInt(10) == 0) { // once is a while advance
182             if (docsAndPosEnum.advance(docID + 1 + random().nextInt((maxDoc - docID))) == DocIdSetIterator.NO_MORE_DOCS) {
183               break;
184             }
185           }
186 
187         } while (docsAndPosEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
188       }
189 
190     }
191     reader.close();
192     dir.close();
193   }
194 
195   public void testRandomDocs() throws IOException {
196     Directory dir = newDirectory();
197     RandomIndexWriter writer = new RandomIndexWriter(random(), dir,
198                                                      newIndexWriterConfig(new MockAnalyzer(random()))
199                                                        .setMergePolicy(newLogMergePolicy()));
200     int numDocs = atLeast(49);
201     int max = 15678;
202     int term = random().nextInt(max);
203     int[] freqInDoc = new int[numDocs];
204     FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
205     customType.setOmitNorms(true);
206     for (int i = 0; i < numDocs; i++) {
207       Document doc = new Document();
208       StringBuilder builder = new StringBuilder();
209       for (int j = 0; j < 199; j++) {
210         int nextInt = random().nextInt(max);
211         builder.append(nextInt).append(' ');
212         if (nextInt == term) {
213           freqInDoc[i]++;
214         }
215       }
216       doc.add(newField(fieldName, builder.toString(), customType));
217       writer.addDocument(doc);
218     }
219 
220     IndexReader reader = writer.getReader();
221     writer.close();
222 
223     int num = atLeast(13);
224     for (int i = 0; i < num; i++) {
225       BytesRef bytes = new BytesRef("" + term);
226       IndexReaderContext topReaderContext = reader.getContext();
227       for (LeafReaderContext context : topReaderContext.leaves()) {
228         int maxDoc = context.reader().maxDoc();
229         PostingsEnum postingsEnum = TestUtil.docs(random(), context.reader(), fieldName, bytes, null, PostingsEnum.FREQS);
230         if (findNext(freqInDoc, context.docBase, context.docBase + maxDoc) == Integer.MAX_VALUE) {
231           assertNull(postingsEnum);
232           continue;
233         }
234         assertNotNull(postingsEnum);
235         postingsEnum.nextDoc();
236         for (int j = 0; j < maxDoc; j++) {
237           if (freqInDoc[context.docBase + j] != 0) {
238             assertEquals(j, postingsEnum.docID());
239             assertEquals(postingsEnum.freq(), freqInDoc[context.docBase +j]);
240             if (i % 2 == 0 && random().nextInt(10) == 0) {
241               int next = findNext(freqInDoc, context.docBase+j+1, context.docBase + maxDoc) - context.docBase;
242               int advancedTo = postingsEnum.advance(next);
243               if (next >= maxDoc) {
244                 assertEquals(DocIdSetIterator.NO_MORE_DOCS, advancedTo);
245               } else {
246                 assertTrue("advanced to: " +advancedTo + " but should be <= " + next, next >= advancedTo);  
247               }
248             } else {
249               postingsEnum.nextDoc();
250             }
251           } 
252         }
253         assertEquals("docBase: " + context.docBase + " maxDoc: " + maxDoc + " " + postingsEnum.getClass(), DocIdSetIterator.NO_MORE_DOCS, postingsEnum.docID());
254       }
255       
256     }
257 
258     reader.close();
259     dir.close();
260   }
261   
262   private static int findNext(int[] docs, int pos, int max) {
263     for (int i = pos; i < max; i++) {
264       if( docs[i] != 0) {
265         return i;
266       }
267     }
268     return Integer.MAX_VALUE;
269   }
270 
271   /**
272    * tests retrieval of positions for terms that have a large number of
273    * occurrences to force test of buffer refill during positions iteration.
274    */
275   public void testLargeNumberOfPositions() throws IOException {
276     Directory dir = newDirectory();
277     RandomIndexWriter writer = new RandomIndexWriter(random(), dir,
278         newIndexWriterConfig(new MockAnalyzer(random())));
279     int howMany = 1000;
280     FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
281     customType.setOmitNorms(true);
282     for (int i = 0; i < 39; i++) {
283       Document doc = new Document();
284       StringBuilder builder = new StringBuilder();
285       for (int j = 0; j < howMany; j++) {
286         if (j % 2 == 0) {
287           builder.append("even ");
288         } else {
289           builder.append("odd ");
290         }
291       }
292       doc.add(newField(fieldName, builder.toString(), customType));
293       writer.addDocument(doc);
294     }
295 
296     // now do searches
297     IndexReader reader = writer.getReader();
298     writer.close();
299 
300     int num = atLeast(13);
301     for (int i = 0; i < num; i++) {
302       BytesRef bytes = new BytesRef("even");
303 
304       IndexReaderContext topReaderContext = reader.getContext();
305       for (LeafReaderContext leafReaderContext : topReaderContext.leaves()) {
306         PostingsEnum docsAndPosEnum = getDocsAndPositions(
307             leafReaderContext.reader(), bytes);
308         assertNotNull(docsAndPosEnum);
309 
310         int initDoc = 0;
311         int maxDoc = leafReaderContext.reader().maxDoc();
312         // initially advance or do next doc
313         if (random().nextBoolean()) {
314           initDoc = docsAndPosEnum.nextDoc();
315         } else {
316           initDoc = docsAndPosEnum.advance(random().nextInt(maxDoc));
317         }
318         String msg = "Iteration: " + i + " initDoc: " + initDoc; // TODO: + " payloads: " + usePayload;
319         assertEquals(howMany / 2, docsAndPosEnum.freq());
320         for (int j = 0; j < howMany; j += 2) {
321           assertEquals("position missmatch index: " + j + " with freq: "
322               + docsAndPosEnum.freq() + " -- " + msg, j,
323               docsAndPosEnum.nextPosition());
324         }
325       }
326     }
327     reader.close();
328     dir.close();
329   }
330   
331   public void testDocsEnumStart() throws Exception {
332     Directory dir = newDirectory();
333     RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
334     Document doc = new Document();
335     doc.add(newStringField("foo", "bar", Field.Store.NO));
336     writer.addDocument(doc);
337     DirectoryReader reader = writer.getReader();
338     LeafReader r = getOnlySegmentReader(reader);
339     PostingsEnum disi = TestUtil.docs(random(), r, "foo", new BytesRef("bar"), null, PostingsEnum.NONE);
340     int docid = disi.docID();
341     assertEquals(-1, docid);
342     assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
343     
344     // now reuse and check again
345     TermsEnum te = r.terms("foo").iterator();
346     assertTrue(te.seekExact(new BytesRef("bar")));
347     disi = TestUtil.docs(random(), te, disi, PostingsEnum.NONE);
348     docid = disi.docID();
349     assertEquals(-1, docid);
350     assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
351     writer.close();
352     r.close();
353     dir.close();
354   }
355   
356   public void testDocsAndPositionsEnumStart() throws Exception {
357     Directory dir = newDirectory();
358     RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
359     Document doc = new Document();
360     doc.add(newTextField("foo", "bar", Field.Store.NO));
361     writer.addDocument(doc);
362     DirectoryReader reader = writer.getReader();
363     LeafReader r = getOnlySegmentReader(reader);
364     PostingsEnum disi = r.postings(new Term("foo", "bar"), PostingsEnum.ALL);
365     int docid = disi.docID();
366     assertEquals(-1, docid);
367     assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
368     
369     // now reuse and check again
370     TermsEnum te = r.terms("foo").iterator();
371     assertTrue(te.seekExact(new BytesRef("bar")));
372     disi = te.postings(disi, PostingsEnum.ALL);
373     docid = disi.docID();
374     assertEquals(-1, docid);
375     assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
376     writer.close();
377     r.close();
378     dir.close();
379   }
380 }